from sklearn.metrics import log_loss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,r2_score
import warnings
import six
from mlxtend.classifier import StackingClassifier
import missingno as msno
from sklearn.ensemble import VotingClassifier
import shap
shap.initjs()
import lime
from lime import lime_tabular
warnings.simplefilter('ignore')
import os
plt.style.use('fivethirtyeight')
plt.style.use('dark_background')
# Load the training data, using the policy Id column as the DataFrame index.
insurance_df = pd.read_csv('./raw_data/train.csv', index_col='Id')
insurance_df.head()
| Product_Info_1 | Product_Info_2 | Product_Info_3 | Product_Info_4 | Product_Info_5 | Product_Info_6 | Product_Info_7 | Ins_Age | Ht | Wt | ... | Medical_Keyword_40 | Medical_Keyword_41 | Medical_Keyword_42 | Medical_Keyword_43 | Medical_Keyword_44 | Medical_Keyword_45 | Medical_Keyword_46 | Medical_Keyword_47 | Medical_Keyword_48 | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | |||||||||||||||||||||
| 2 | 1 | D3 | 10 | 0.076923 | 2 | 1 | 1 | 0.641791 | 0.581818 | 0.148536 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
| 5 | 1 | A1 | 26 | 0.076923 | 2 | 3 | 1 | 0.059701 | 0.600000 | 0.131799 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
| 6 | 1 | E1 | 26 | 0.076923 | 2 | 3 | 1 | 0.029851 | 0.745455 | 0.288703 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
| 7 | 1 | D4 | 10 | 0.487179 | 2 | 3 | 1 | 0.164179 | 0.672727 | 0.205021 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
| 8 | 1 | D2 | 26 | 0.230769 | 2 | 3 | 1 | 0.417910 | 0.654545 | 0.234310 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
5 rows × 127 columns
insurance_df.shape
(59381, 127)
insurance_df['Response'].value_counts()
8 19489 6 11233 7 8027 2 6552 1 6207 5 5432 4 1428 3 1013 Name: Response, dtype: int64
Class imbalance can be seen here. Also, there are 8 categories; let's collapse them into two (response 8 vs. everything else).
sns.countplot(x=insurance_df['Response']);
Response 8 has the highest count and response 3 has the least.
# Collapse the 8 response levels into a binary target: 1 for response 8, 0 otherwise
# (-1 is an unreachable sentinel kept for parity with the original mapping).
insurance_df['Modified_Response'] = insurance_df['Response'].apply(
    lambda r: 0 if 0 <= r <= 7 else (1 if r == 8 else -1))
sns.countplot(x= insurance_df['Modified_Response']);
Still some imbalance can be seen
# Dropping the old response column now that Modified_Response replaces it.
insurance_df.drop('Response', axis=1, inplace=True)
# Making lists with categorical and numerical features.
categorical = [col for col in insurance_df.columns if insurance_df[col].dtype == 'object']
# BUG FIX: the original wrote `numerical = categorical = [...]`, which rebound
# `categorical` to the numeric column list and threw away the line above.
numerical = [col for col in insurance_df.columns if insurance_df[col].dtype != 'object']
# Bar charts of level frequencies for each categorical feature.
for feature in categorical:
    freq = insurance_df[feature].value_counts().sort_index()
    n_levels = len(freq)
    if n_levels > 50:
        # Too many distinct levels to plot legibly - skip this feature.
        continue
    if 10 < n_levels < 50:
        fig = plt.figure(figsize=(30, 10))
    else:
        fig = plt.figure(figsize=(9, 6))
    ax = fig.gca()
    freq.plot.bar(ax=ax, color='steelblue')
    ax.set_title(feature + ' counts')
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")
    plt.show()
D3 has the highest frequencies
Most of the features here are unbalanced.
# Distribution and outlier check for Employment_Info_1.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
# sns.distplot is deprecated (removed in seaborn >= 0.14); histplot + KDE replaces it.
sns.histplot(insurance_df['Employment_Info_1'], kde=True, ax=axes[0])
sns.boxplot(x=insurance_df['Employment_Info_1'], ax=axes[1])
<AxesSubplot:xlabel='Employment_Info_1'>
Right skewed.
Outliers can be seen.
# Distribution and outlier check for Employment_Info_4.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
# sns.distplot is deprecated (removed in seaborn >= 0.14); histplot + KDE replaces it.
sns.histplot(insurance_df['Employment_Info_4'], kde=True, ax=axes[0])
sns.boxplot(x=insurance_df['Employment_Info_4'], ax=axes[1])
<AxesSubplot:xlabel='Employment_Info_4'>
# Distribution and outlier check for Employment_Info_6.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
# sns.distplot is deprecated (removed in seaborn >= 0.14); histplot + KDE replaces it.
sns.histplot(insurance_df['Employment_Info_6'], kde=True, ax=axes[0])
sns.boxplot(x=insurance_df['Employment_Info_6'], ax=axes[1])
<AxesSubplot:xlabel='Employment_Info_6'>
# Distribution and outlier check for Family_Hist_4.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
# sns.distplot is deprecated (removed in seaborn >= 0.14); histplot + KDE replaces it.
sns.histplot(insurance_df['Family_Hist_4'], kde=True, ax=axes[0])
sns.boxplot(x=insurance_df['Family_Hist_4'], ax=axes[1])
<AxesSubplot:xlabel='Family_Hist_4'>
# Pairwise Pearson correlations; keep only cells with r >= 0.8 (others become NaN).
# NOTE(review): corr() silently skips the object column (Product_Info_2) in
# pandas < 2.0 but raises in pandas >= 2.0 unless numeric_only=True is passed -
# confirm the pandas version pinned for this notebook.
corr = insurance_df.corr()
corr_greater_than_80 = corr[corr>=.8]
corr_greater_than_80
| Product_Info_1 | Product_Info_3 | Product_Info_4 | Product_Info_5 | Product_Info_6 | Product_Info_7 | Ins_Age | Ht | Wt | BMI | ... | Medical_Keyword_40 | Medical_Keyword_41 | Medical_Keyword_42 | Medical_Keyword_43 | Medical_Keyword_44 | Medical_Keyword_45 | Medical_Keyword_46 | Medical_Keyword_47 | Medical_Keyword_48 | Modified_Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Product_Info_1 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Info_3 | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Info_4 | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Info_5 | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Info_6 | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Medical_Keyword_45 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN |
| Medical_Keyword_46 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN |
| Medical_Keyword_47 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN |
| Medical_Keyword_48 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN |
| Modified_Response | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
126 rows × 126 columns
# Heatmap of the >= 0.8 correlation cells (NaN cells render blank).
plt.figure(figsize=(12,8))
sns.heatmap(corr_greater_than_80, cmap="Reds");
BMI and Weight are highly correlated, which makes sense as these two features are directly proportional.
Ins_Age and Family_Hist_4, Family_Hist_2 highly correlated
Although, I am not going to perform any transformation on any feature or drop any as these are tree based models and they don't get affected by correlation much because of their non parametric nature.
# Widen the pandas display limits so wide frames print fully.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
# Fraction of missing values per column; report those above 40%.
missing_val_count_by_column = insurance_df.isnull().mean()
print(missing_val_count_by_column[missing_val_count_by_column > 0.4].sort_values(ascending=False))
Medical_History_10 0.990620 Medical_History_32 0.981358 Medical_History_24 0.935990 Medical_History_15 0.751015 Family_Hist_5 0.704114 Family_Hist_3 0.576632 Family_Hist_2 0.482579 Insurance_History_5 0.427679 dtype: float64
# Keep only columns with at least 40% non-null values, i.e. drop columns that
# are missing more than 60% of their entries (drops Medical_History_10/15/24/32
# and Family_Hist_5 here - NOT every column over 40% missing as the old comment
# claimed).
# BUG FIX: the original also passed how='all' alongside thresh; pandas ignores
# `how` when thresh is given, and pandas >= 2.0 raises if both are supplied.
insurance_df = insurance_df.dropna(thresh=int(insurance_df.shape[0] * 0.4), axis=1)
# Product_Info_2 (the only object column) does not contain important information.
insurance_df.drop('Product_Info_2', axis=1, inplace=True)
# Data for all the independent variables
X = insurance_df.drop(labels='Modified_Response',axis=1)
# Data for the dependent variable
Y = insurance_df['Modified_Response']
# Filling remaining missing values with the column means (all columns are numeric at this point)
X = X.fillna(X.mean())
# Train-test split: 75/25 with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.25, random_state=1)
# Check the shape of train dataset
print(X_train.shape,Y_train.shape)
# Check the shape of test dataset
print(X_test.shape, Y_test.shape)
(44535, 120) (44535,) (14846, 120) (14846,)
# Utility Functions
def check_scores(model, X_train, X_test, y_train=None, y_test=None):
    """Print and return train/test metrics for a fitted binary classifier.

    Parameters
    ----------
    model : fitted estimator exposing predict and predict_proba
    X_train, X_test : feature matrices for the train and test splits
    y_train, y_test : optional label vectors; when omitted they fall back to
        the notebook-level Y_train / Y_test globals (backward compatible with
        the original implementation, which read the globals implicitly).

    Returns
    -------
    tuple
        (model, train_auc, test_auc, train_accuracy, test_accuracy,
         f1, precision, recall, train_log, test_log)
    """
    # Backward-compatible fallback to the notebook globals.
    if y_train is None:
        y_train = Y_train
    if y_test is None:
        y_test = Y_test
    # Hard class predictions on train and test data
    train_class_preds = model.predict(X_train)
    test_class_preds = model.predict(X_test)
    # Positive-class probabilities on train and test
    train_preds = model.predict_proba(X_train)[:, 1]
    test_preds = model.predict_proba(X_test)[:, 1]
    # Calculating accuracy on train and test
    train_accuracy = accuracy_score(y_train, train_class_preds)
    test_accuracy = accuracy_score(y_test, test_class_preds)
    print("The accuracy on train dataset is", train_accuracy)
    print("The accuracy on test dataset is", test_accuracy)
    print()
    # Get the confusion matrices for train and test
    train_cm = confusion_matrix(y_train, train_class_preds)
    test_cm = confusion_matrix(y_test, test_class_preds)
    print('Train confusion matrix:')
    print( train_cm)
    print()
    print('Test confusion matrix:')
    print(test_cm)
    print()
    # ROC-AUC computed from the probability scores, not the hard labels
    train_auc = roc_auc_score(y_train, train_preds)
    test_auc = roc_auc_score(y_test, test_preds)
    print('ROC on train data:', train_auc)
    print('ROC on test data:', test_auc)
    # Fscore, precision and recall on test data
    f1 = f1_score(y_test, test_class_preds)
    precision = precision_score(y_test, test_class_preds)
    recall = recall_score(y_test, test_class_preds)
    # Log loss on train and test data (the original comment mislabeled this as R2)
    train_log = log_loss(y_train, train_preds)
    test_log = log_loss(y_test, test_preds)
    print()
    print('Train log loss:', train_log)
    print('Test log loss:', test_log)
    print()
    print("F score is:", f1)
    print("Precision is:", precision)
    print("Recall is:", recall)
    return model, train_auc, test_auc, train_accuracy, test_accuracy, f1, precision, recall, train_log, test_log
def check_importance(model, X_train):
    """Return a DataFrame of features sorted by descending importance.

    `model` must be a fitted estimator exposing `feature_importances_`;
    the scores are rounded to 2 decimals so tiny values print as 0 instead
    of scientific notation.
    """
    # Pair every column name with its fitted importance score.
    importance_df = pd.DataFrame({
        'Feature': list(X_train.columns),
        'Feature Importance': model.feature_importances_,
    })
    importance_df['Feature Importance'] = importance_df['Feature Importance'].round(2)
    return importance_df.sort_values(by=['Feature Importance'], ascending=False)
def grid_search(model, parameters, X_train, Y_train):
    """Exhaustively search `parameters` with 2-fold CV (ROC-AUC scoring)
    and return the best refit estimator."""
    searcher = GridSearchCV(estimator=model,
                            param_grid=parameters,
                            cv=2, verbose=2, scoring='roc_auc')
    # Fit every candidate combination on the training data.
    searcher.fit(X_train, Y_train)
    print()
    print()
    # Estimator refit on the full training set with the winning parameters.
    best_model = searcher.best_estimator_
    print('Best parameters are: ')
    pprint( searcher.best_params_)
    return best_model
# This function will show how each feature pushes one prediction towards 0 or 1
def interpret_with_lime(model, X_test, X_train=None, row=10):
    """Explain one test row with LIME and render the explanation inline.

    BUG FIX: the original read the module-level X_train global even though it
    was not a parameter, and hard-coded row 10. Both are now optional
    arguments; the defaults reproduce the original behavior exactly.
    """
    if X_train is None:
        # Backward-compatible fallback to the notebook global.
        X_train = globals()['X_train']
    interpretor = lime_tabular.LimeTabularExplainer(
        training_data=np.array(X_train),
        feature_names=X_train.columns,
        mode='classification')
    exp = interpretor.explain_instance(
        data_row=X_test.iloc[row],
        predict_fn=model.predict_proba
    )
    exp.show_in_notebook(show_table=True)
# This gives feature importance
def plot_feature_importance(model, X_train):
    """Bar-plot the features whose rounded importance is greater than zero."""
    fig = plt.figure(figsize=(15, 8))
    # Compute the importance table once (the original recomputed it four times).
    importance_df = check_importance(model, X_train)
    contributing = importance_df[importance_df['Feature Importance'] > 0]
    plt.bar(contributing['Feature'].values,
            contributing['Feature Importance'].values,
            color='blue', width=0.4)
    plt.xticks(rotation='vertical')
    plt.show()
# Number of trees in the forest
n_estimators = [50,80,100]
# Maximum depth of each tree
max_depth = [4,6,8]
# Minimum number of samples required to split an internal node
min_samples_split = [50,100,150]
# Minimum number of samples required at a leaf node
min_samples_leaf = [40,50]
# Assemble the hyperparameter grid for the random forest
rf_parameters = dict(n_estimators=n_estimators,
                     max_depth=max_depth,
                     min_samples_split=min_samples_split,
                     min_samples_leaf=min_samples_leaf)
pprint(rf_parameters)
# Run the grid search and keep the best random forest
rf_optimal_model = grid_search(RandomForestClassifier(), rf_parameters, X_train, Y_train)
{'max_depth': [4, 6, 8],
'min_samples_leaf': [40, 50],
'min_samples_split': [50, 100, 150],
'n_estimators': [50, 80, 100]}
Fitting 2 folds for each of 54 candidates, totalling 108 fits
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=50, n_estimators=50; total time= 0.8s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=50, n_estimators=50; total time= 0.8s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=50, n_estimators=80; total time= 1.2s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=50, n_estimators=80; total time= 1.3s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=50, n_estimators=100; total time= 1.6s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=50, n_estimators=100; total time= 2.7s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=100, n_estimators=50; total time= 0.9s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=100, n_estimators=50; total time= 0.8s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=100, n_estimators=80; total time= 1.8s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=100, n_estimators=80; total time= 1.8s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=100, n_estimators=100; total time= 2.6s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=100, n_estimators=100; total time= 1.8s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=150, n_estimators=50; total time= 0.8s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=150, n_estimators=50; total time= 0.8s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=150, n_estimators=80; total time= 1.4s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=150, n_estimators=80; total time= 1.4s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=150, n_estimators=100; total time= 1.5s
[CV] END max_depth=4, min_samples_leaf=40, min_samples_split=150, n_estimators=100; total time= 1.6s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=50, n_estimators=50; total time= 0.7s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=50, n_estimators=50; total time= 0.7s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=50, n_estimators=80; total time= 1.3s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=50, n_estimators=80; total time= 1.3s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=50, n_estimators=100; total time= 1.6s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=50, n_estimators=100; total time= 1.6s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=100, n_estimators=50; total time= 0.8s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=100, n_estimators=50; total time= 0.8s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=100, n_estimators=80; total time= 1.3s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=100, n_estimators=80; total time= 1.4s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=100, n_estimators=100; total time= 1.8s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=100, n_estimators=100; total time= 1.5s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=150, n_estimators=50; total time= 0.8s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=150, n_estimators=50; total time= 0.8s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=150, n_estimators=80; total time= 1.3s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=150, n_estimators=80; total time= 1.3s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=150, n_estimators=100; total time= 2.0s
[CV] END max_depth=4, min_samples_leaf=50, min_samples_split=150, n_estimators=100; total time= 2.2s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=50, n_estimators=50; total time= 1.0s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=50, n_estimators=50; total time= 1.1s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=50, n_estimators=80; total time= 1.8s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=50, n_estimators=80; total time= 1.8s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=50, n_estimators=100; total time= 2.4s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=50, n_estimators=100; total time= 2.2s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=100, n_estimators=50; total time= 1.1s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=100, n_estimators=50; total time= 1.1s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=100, n_estimators=80; total time= 1.8s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=100, n_estimators=80; total time= 1.7s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=100, n_estimators=100; total time= 2.2s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=100, n_estimators=100; total time= 2.3s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=150, n_estimators=50; total time= 1.1s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=150, n_estimators=50; total time= 1.1s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=150, n_estimators=80; total time= 1.7s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=150, n_estimators=80; total time= 1.6s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=150, n_estimators=100; total time= 2.1s
[CV] END max_depth=6, min_samples_leaf=40, min_samples_split=150, n_estimators=100; total time= 2.1s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=50, n_estimators=50; total time= 1.0s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=50, n_estimators=50; total time= 1.0s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=50, n_estimators=80; total time= 1.7s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=50, n_estimators=80; total time= 1.7s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=50, n_estimators=100; total time= 2.1s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=50, n_estimators=100; total time= 2.0s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=100, n_estimators=50; total time= 1.0s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=100, n_estimators=50; total time= 1.0s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=100, n_estimators=80; total time= 1.8s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=100, n_estimators=80; total time= 1.8s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=100, n_estimators=100; total time= 2.2s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=100, n_estimators=100; total time= 2.3s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=150, n_estimators=50; total time= 1.1s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=150, n_estimators=50; total time= 1.0s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=150, n_estimators=80; total time= 1.7s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=150, n_estimators=80; total time= 2.0s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=150, n_estimators=100; total time= 2.1s
[CV] END max_depth=6, min_samples_leaf=50, min_samples_split=150, n_estimators=100; total time= 2.1s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=50, n_estimators=50; total time= 1.3s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=50, n_estimators=50; total time= 1.3s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=50, n_estimators=80; total time= 2.1s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=50, n_estimators=80; total time= 2.1s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=50, n_estimators=100; total time= 2.6s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=50, n_estimators=100; total time= 2.6s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=100, n_estimators=50; total time= 1.3s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=100, n_estimators=50; total time= 1.3s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=100, n_estimators=80; total time= 2.1s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=100, n_estimators=80; total time= 2.0s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=100, n_estimators=100; total time= 2.6s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=100, n_estimators=100; total time= 2.6s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=150, n_estimators=50; total time= 1.3s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=150, n_estimators=50; total time= 1.3s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=150, n_estimators=80; total time= 2.0s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=150, n_estimators=80; total time= 2.1s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=150, n_estimators=100; total time= 2.6s
[CV] END max_depth=8, min_samples_leaf=40, min_samples_split=150, n_estimators=100; total time= 2.6s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=50, n_estimators=50; total time= 1.2s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=50, n_estimators=50; total time= 1.3s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=50, n_estimators=80; total time= 2.1s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=50, n_estimators=80; total time= 2.1s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=50, n_estimators=100; total time= 2.8s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=50, n_estimators=100; total time= 2.6s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=100, n_estimators=50; total time= 1.3s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=100, n_estimators=50; total time= 1.3s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=100, n_estimators=80; total time= 2.1s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=100, n_estimators=80; total time= 2.1s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=100, n_estimators=100; total time= 2.6s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=100, n_estimators=100; total time= 2.6s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=150, n_estimators=50; total time= 1.2s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=150, n_estimators=50; total time= 1.3s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=150, n_estimators=80; total time= 2.3s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=150, n_estimators=80; total time= 2.0s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=150, n_estimators=100; total time= 2.6s
[CV] END max_depth=8, min_samples_leaf=50, min_samples_split=150, n_estimators=100; total time= 2.5s
Best parameters are:
{'max_depth': 8,
'min_samples_leaf': 40,
'min_samples_split': 150,
'n_estimators': 100}
# Evaluate the tuned random forest on the train/test splits and unpack every metric.
rf_model, rf_train_auc, rf_test_auc, rf_train_accuracy, rf_test_accuracy,rf_f1, rf_precision,rf_recall,rf_train_log, rf_test_log = check_scores(rf_optimal_model, X_train, X_test )
The accuracy on train dataset is 0.8103065005052206 The accuracy on test dataset is 0.8061430688400916 Train confusion matrix: [[27190 2717] [ 5731 8897]] Test confusion matrix: [[9077 908] [1970 2891]] ROC on train data: 0.8921542046043217 ROC on test data: 0.8862132944324943 Train log loss: 0.42438779691098094 Test log loss: 0.42871149664419084 F score is: 0.6676674364896074 Precision is: 0.7609897341405633 Recall is: 0.594733593910718
# Feature-importance table (sorted descending) for the tuned random forest.
check_importance(rf_model, X_train)
| Feature | Feature Importance | |
|---|---|---|
| 9 | BMI | 0.24 |
| 8 | Wt | 0.19 |
| 38 | Medical_History_4 | 0.11 |
| 86 | Medical_Keyword_15 | 0.11 |
| 55 | Medical_History_23 | 0.09 |
| 6 | Ins_Age | 0.04 |
| 2 | Product_Info_4 | 0.03 |
| 21 | InsuredInfo_6 | 0.03 |
| 34 | Family_Hist_4 | 0.02 |
| 74 | Medical_Keyword_3 | 0.02 |
| 40 | Medical_History_6 | 0.01 |
| 63 | Medical_History_33 | 0.01 |
| 7 | Ht | 0.01 |
| 69 | Medical_History_39 | 0.01 |
| 35 | Medical_History_1 | 0.01 |
| 48 | Medical_History_16 | 0.01 |
| 32 | Family_Hist_2 | 0.01 |
| 94 | Medical_Keyword_23 | 0.01 |
| 85 | Medical_Keyword_14 | 0.00 |
| 90 | Medical_Keyword_19 | 0.00 |
| 87 | Medical_Keyword_16 | 0.00 |
| 88 | Medical_Keyword_17 | 0.00 |
| 89 | Medical_Keyword_18 | 0.00 |
| 84 | Medical_Keyword_13 | 0.00 |
| 79 | Medical_Keyword_8 | 0.00 |
| 83 | Medical_Keyword_12 | 0.00 |
| 82 | Medical_Keyword_11 | 0.00 |
| 81 | Medical_Keyword_10 | 0.00 |
| 80 | Medical_Keyword_9 | 0.00 |
| 78 | Medical_Keyword_7 | 0.00 |
| 77 | Medical_Keyword_6 | 0.00 |
| 76 | Medical_Keyword_5 | 0.00 |
| 75 | Medical_Keyword_4 | 0.00 |
| 73 | Medical_Keyword_2 | 0.00 |
| 72 | Medical_Keyword_1 | 0.00 |
| 71 | Medical_History_41 | 0.00 |
| 70 | Medical_History_40 | 0.00 |
| 91 | Medical_Keyword_20 | 0.00 |
| 0 | Product_Info_1 | 0.00 |
| 92 | Medical_Keyword_21 | 0.00 |
| 93 | Medical_Keyword_22 | 0.00 |
| 118 | Medical_Keyword_47 | 0.00 |
| 117 | Medical_Keyword_46 | 0.00 |
| 116 | Medical_Keyword_45 | 0.00 |
| 115 | Medical_Keyword_44 | 0.00 |
| 114 | Medical_Keyword_43 | 0.00 |
| 113 | Medical_Keyword_42 | 0.00 |
| 112 | Medical_Keyword_41 | 0.00 |
| 111 | Medical_Keyword_40 | 0.00 |
| 110 | Medical_Keyword_39 | 0.00 |
| 109 | Medical_Keyword_38 | 0.00 |
| 108 | Medical_Keyword_37 | 0.00 |
| 107 | Medical_Keyword_36 | 0.00 |
| 106 | Medical_Keyword_35 | 0.00 |
| 105 | Medical_Keyword_34 | 0.00 |
| 104 | Medical_Keyword_33 | 0.00 |
| 103 | Medical_Keyword_32 | 0.00 |
| 102 | Medical_Keyword_31 | 0.00 |
| 101 | Medical_Keyword_30 | 0.00 |
| 100 | Medical_Keyword_29 | 0.00 |
| 99 | Medical_Keyword_28 | 0.00 |
| 98 | Medical_Keyword_27 | 0.00 |
| 97 | Medical_Keyword_26 | 0.00 |
| 96 | Medical_Keyword_25 | 0.00 |
| 95 | Medical_Keyword_24 | 0.00 |
| 67 | Medical_History_37 | 0.00 |
| 68 | Medical_History_38 | 0.00 |
| 60 | Medical_History_29 | 0.00 |
| 66 | Medical_History_36 | 0.00 |
| 65 | Medical_History_35 | 0.00 |
| 30 | Insurance_History_9 | 0.00 |
| 29 | Insurance_History_8 | 0.00 |
| 28 | Insurance_History_7 | 0.00 |
| 27 | Insurance_History_5 | 0.00 |
| 26 | Insurance_History_4 | 0.00 |
| 25 | Insurance_History_3 | 0.00 |
| 24 | Insurance_History_2 | 0.00 |
| 23 | Insurance_History_1 | 0.00 |
| 22 | InsuredInfo_7 | 0.00 |
| 20 | InsuredInfo_5 | 0.00 |
| 19 | InsuredInfo_4 | 0.00 |
| 18 | InsuredInfo_3 | 0.00 |
| 17 | InsuredInfo_2 | 0.00 |
| 16 | InsuredInfo_1 | 0.00 |
| 15 | Employment_Info_6 | 0.00 |
| 14 | Employment_Info_5 | 0.00 |
| 13 | Employment_Info_4 | 0.00 |
| 12 | Employment_Info_3 | 0.00 |
| 11 | Employment_Info_2 | 0.00 |
| 10 | Employment_Info_1 | 0.00 |
| 5 | Product_Info_7 | 0.00 |
| 4 | Product_Info_6 | 0.00 |
| 3 | Product_Info_5 | 0.00 |
| 31 | Family_Hist_1 | 0.00 |
| 33 | Family_Hist_3 | 0.00 |
| 36 | Medical_History_2 | 0.00 |
| 52 | Medical_History_20 | 0.00 |
| 64 | Medical_History_34 | 0.00 |
| 62 | Medical_History_31 | 0.00 |
| 61 | Medical_History_30 | 0.00 |
| 1 | Product_Info_3 | 0.00 |
| 59 | Medical_History_28 | 0.00 |
| 58 | Medical_History_27 | 0.00 |
| 57 | Medical_History_26 | 0.00 |
| 56 | Medical_History_25 | 0.00 |
| 54 | Medical_History_22 | 0.00 |
| 53 | Medical_History_21 | 0.00 |
| 51 | Medical_History_19 | 0.00 |
| 37 | Medical_History_3 | 0.00 |
| 50 | Medical_History_18 | 0.00 |
| 49 | Medical_History_17 | 0.00 |
| 47 | Medical_History_14 | 0.00 |
| 46 | Medical_History_13 | 0.00 |
| 45 | Medical_History_12 | 0.00 |
| 44 | Medical_History_11 | 0.00 |
| 43 | Medical_History_9 | 0.00 |
| 42 | Medical_History_8 | 0.00 |
| 41 | Medical_History_7 | 0.00 |
| 39 | Medical_History_5 | 0.00 |
| 119 | Medical_Keyword_48 | 0.00 |
# Plotting only those features that contribute a non-zero (rounded) importance.
plot_feature_importance(rf_model, X_train)
BMI, weight, Medical_History_23, Medical_History_4 and Medical_Keyword_15 seem to be important features according to the random forest.
Also, only these features are contributing to the model prediction. Some non-contributing features could be eliminated on further investigation.
# Interpreting the model locally with LIME (explains a single test row).
interpret_with_lime(rf_model,X_test)
# Interpreting the model globally with SHAP.
X_shap=X_train
# TreeExplainer computes SHAP values efficiently for tree ensembles.
rf_explainer = shap.TreeExplainer(rf_model)
rf_shap_values = rf_explainer.shap_values(X_shap)
# Bar summary: mean |SHAP value| per feature over the training set.
shap.summary_plot(rf_shap_values, X_shap, plot_type="bar")
Medical keyword 15,medical history 9, Wt, medical history 3 all pushing towards 1.
Orange ones are pushing towards 1.
# Plotting for top 5 features
# Dependence plots: how each top feature's value drives its SHAP contribution
# (rf_shap_values[0] holds the SHAP values for class 0).
top_vars = ['BMI','Medical_Keyword_15','Medical_History_4','Wt','Medical_History_23']
index_top_vars =[list(X_train.columns).index(var) for var in top_vars]
for elem in index_top_vars:
    shap.dependence_plot(elem, rf_shap_values[0], X_train)
With high medical history 23 and low bmi we get class 1
# Hyperparameter search space for gradient boosting
gb_parameters = {
    "n_estimators": [5, 50, 250],
    "max_depth": [1, 3, 5, 7],
    "learning_rate": [0.01, 0.1, 1],
}
pprint(gb_parameters)
# Find the best gradient-boosting model via grid search over the space above
gb_optimal_model = grid_search(GradientBoostingClassifier(), gb_parameters, X_train, Y_train)
{'learning_rate': [0.01, 0.1, 1],
'max_depth': [1, 3, 5, 7],
'n_estimators': [5, 50, 250]}
Fitting 2 folds for each of 36 candidates, totalling 72 fits
[CV] END ....learning_rate=0.01, max_depth=1, n_estimators=5; total time= 0.2s
[CV] END ....learning_rate=0.01, max_depth=1, n_estimators=5; total time= 0.2s
[CV] END ...learning_rate=0.01, max_depth=1, n_estimators=50; total time= 1.6s
[CV] END ...learning_rate=0.01, max_depth=1, n_estimators=50; total time= 1.6s
[CV] END ..learning_rate=0.01, max_depth=1, n_estimators=250; total time= 8.0s
[CV] END ..learning_rate=0.01, max_depth=1, n_estimators=250; total time= 8.0s
[CV] END ....learning_rate=0.01, max_depth=3, n_estimators=5; total time= 0.5s
[CV] END ....learning_rate=0.01, max_depth=3, n_estimators=5; total time= 0.5s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time= 4.7s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time= 4.7s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=250; total time= 23.3s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=250; total time= 23.9s
[CV] END ....learning_rate=0.01, max_depth=5, n_estimators=5; total time= 0.8s
[CV] END ....learning_rate=0.01, max_depth=5, n_estimators=5; total time= 0.8s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time= 8.5s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time= 8.4s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=250; total time= 42.7s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=250; total time= 42.0s
[CV] END ....learning_rate=0.01, max_depth=7, n_estimators=5; total time= 1.2s
[CV] END ....learning_rate=0.01, max_depth=7, n_estimators=5; total time= 1.7s
[CV] END ...learning_rate=0.01, max_depth=7, n_estimators=50; total time= 13.5s
[CV] END ...learning_rate=0.01, max_depth=7, n_estimators=50; total time= 12.5s
[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=250; total time= 1.1min
[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=250; total time= 1.1min
[CV] END .....learning_rate=0.1, max_depth=1, n_estimators=5; total time= 0.2s
[CV] END .....learning_rate=0.1, max_depth=1, n_estimators=5; total time= 0.2s
[CV] END ....learning_rate=0.1, max_depth=1, n_estimators=50; total time= 1.6s
[CV] END ....learning_rate=0.1, max_depth=1, n_estimators=50; total time= 1.6s
[CV] END ...learning_rate=0.1, max_depth=1, n_estimators=250; total time= 8.1s
[CV] END ...learning_rate=0.1, max_depth=1, n_estimators=250; total time= 8.1s
[CV] END .....learning_rate=0.1, max_depth=3, n_estimators=5; total time= 0.5s
[CV] END .....learning_rate=0.1, max_depth=3, n_estimators=5; total time= 0.5s
[CV] END ....learning_rate=0.1, max_depth=3, n_estimators=50; total time= 4.8s
[CV] END ....learning_rate=0.1, max_depth=3, n_estimators=50; total time= 4.8s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=250; total time= 23.4s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=250; total time= 23.8s
[CV] END .....learning_rate=0.1, max_depth=5, n_estimators=5; total time= 0.9s
[CV] END .....learning_rate=0.1, max_depth=5, n_estimators=5; total time= 0.9s
[CV] END ....learning_rate=0.1, max_depth=5, n_estimators=50; total time= 8.5s
[CV] END ....learning_rate=0.1, max_depth=5, n_estimators=50; total time= 8.4s
[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=250; total time= 40.5s
[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=250; total time= 39.8s
[CV] END .....learning_rate=0.1, max_depth=7, n_estimators=5; total time= 1.3s
[CV] END .....learning_rate=0.1, max_depth=7, n_estimators=5; total time= 1.3s
[CV] END ....learning_rate=0.1, max_depth=7, n_estimators=50; total time= 12.9s
[CV] END ....learning_rate=0.1, max_depth=7, n_estimators=50; total time= 12.5s
[CV] END ...learning_rate=0.1, max_depth=7, n_estimators=250; total time= 59.3s
[CV] END ...learning_rate=0.1, max_depth=7, n_estimators=250; total time= 57.9s
[CV] END .......learning_rate=1, max_depth=1, n_estimators=5; total time= 0.2s
[CV] END .......learning_rate=1, max_depth=1, n_estimators=5; total time= 0.2s
[CV] END ......learning_rate=1, max_depth=1, n_estimators=50; total time= 1.6s
[CV] END ......learning_rate=1, max_depth=1, n_estimators=50; total time= 1.6s
[CV] END .....learning_rate=1, max_depth=1, n_estimators=250; total time= 8.2s
[CV] END .....learning_rate=1, max_depth=1, n_estimators=250; total time= 8.0s
[CV] END .......learning_rate=1, max_depth=3, n_estimators=5; total time= 0.5s
[CV] END .......learning_rate=1, max_depth=3, n_estimators=5; total time= 0.5s
[CV] END ......learning_rate=1, max_depth=3, n_estimators=50; total time= 4.7s
[CV] END ......learning_rate=1, max_depth=3, n_estimators=50; total time= 4.8s
[CV] END .....learning_rate=1, max_depth=3, n_estimators=250; total time= 23.5s
[CV] END .....learning_rate=1, max_depth=3, n_estimators=250; total time= 23.3s
[CV] END .......learning_rate=1, max_depth=5, n_estimators=5; total time= 0.8s
[CV] END .......learning_rate=1, max_depth=5, n_estimators=5; total time= 0.8s
[CV] END ......learning_rate=1, max_depth=5, n_estimators=50; total time= 8.1s
[CV] END ......learning_rate=1, max_depth=5, n_estimators=50; total time= 8.1s
[CV] END .....learning_rate=1, max_depth=5, n_estimators=250; total time= 41.0s
[CV] END .....learning_rate=1, max_depth=5, n_estimators=250; total time= 39.8s
[CV] END .......learning_rate=1, max_depth=7, n_estimators=5; total time= 1.2s
[CV] END .......learning_rate=1, max_depth=7, n_estimators=5; total time= 1.2s
[CV] END ......learning_rate=1, max_depth=7, n_estimators=50; total time= 11.8s
[CV] END ......learning_rate=1, max_depth=7, n_estimators=50; total time= 11.6s
[CV] END .....learning_rate=1, max_depth=7, n_estimators=250; total time= 1.0min
[CV] END .....learning_rate=1, max_depth=7, n_estimators=250; total time= 1.1min
Best parameters are:
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 250}
# Getting the scores for all the score metrics used here (accuracy, ROC AUC, log loss, F1, precision, recall)
gb_model, gb_train_auc, gb_test_auc, gb_train_accuracy, gb_test_accuracy,gb_f1, gb_precision,gb_recall,gb_train_log, gb_test_log = check_scores(gb_optimal_model, X_train, X_test )
The accuracy on train dataset is 0.8644886044683956 The accuracy on test dataset is 0.8337599353361175 Train confusion matrix: [[26630 3277] [ 2758 11870]] Test confusion matrix: [[8725 1260] [1208 3653]] ROC on train data: 0.9378223036029318 ROC on test data: 0.9092139546493161 Train log loss: 0.3033252862180813 Test log loss: 0.35390893670516516 F score is: 0.7474933497032945 Precision is: 0.7435375534296764 Recall is: 0.7514914626620037
# Tabulate gradient-boosting feature importances (sorted, as shown below)
check_importance(gb_model, X_train)
| Feature | Feature Importance | |
|---|---|---|
| 9 | BMI | 0.39 |
| 55 | Medical_History_23 | 0.14 |
| 38 | Medical_History_4 | 0.13 |
| 2 | Product_Info_4 | 0.05 |
| 86 | Medical_Keyword_15 | 0.03 |
| 6 | Ins_Age | 0.03 |
| 8 | Wt | 0.03 |
| 74 | Medical_Keyword_3 | 0.02 |
| 34 | Family_Hist_4 | 0.01 |
| 33 | Family_Hist_3 | 0.01 |
| 21 | InsuredInfo_6 | 0.01 |
| 31 | Family_Hist_1 | 0.01 |
| 32 | Family_Hist_2 | 0.01 |
| 15 | Employment_Info_6 | 0.01 |
| 61 | Medical_History_30 | 0.01 |
| 10 | Employment_Info_1 | 0.01 |
| 35 | Medical_History_1 | 0.01 |
| 36 | Medical_History_2 | 0.01 |
| 90 | Medical_Keyword_19 | 0.00 |
| 84 | Medical_Keyword_13 | 0.00 |
| 89 | Medical_Keyword_18 | 0.00 |
| 88 | Medical_Keyword_17 | 0.00 |
| 87 | Medical_Keyword_16 | 0.00 |
| 91 | Medical_Keyword_20 | 0.00 |
| 85 | Medical_Keyword_14 | 0.00 |
| 80 | Medical_Keyword_9 | 0.00 |
| 83 | Medical_Keyword_12 | 0.00 |
| 82 | Medical_Keyword_11 | 0.00 |
| 81 | Medical_Keyword_10 | 0.00 |
| 93 | Medical_Keyword_22 | 0.00 |
| 79 | Medical_Keyword_8 | 0.00 |
| 78 | Medical_Keyword_7 | 0.00 |
| 77 | Medical_Keyword_6 | 0.00 |
| 76 | Medical_Keyword_5 | 0.00 |
| 75 | Medical_Keyword_4 | 0.00 |
| 73 | Medical_Keyword_2 | 0.00 |
| 72 | Medical_Keyword_1 | 0.00 |
| 71 | Medical_History_41 | 0.00 |
| 70 | Medical_History_40 | 0.00 |
| 92 | Medical_Keyword_21 | 0.00 |
| 0 | Product_Info_1 | 0.00 |
| 94 | Medical_Keyword_23 | 0.00 |
| 107 | Medical_Keyword_36 | 0.00 |
| 118 | Medical_Keyword_47 | 0.00 |
| 117 | Medical_Keyword_46 | 0.00 |
| 116 | Medical_Keyword_45 | 0.00 |
| 115 | Medical_Keyword_44 | 0.00 |
| 114 | Medical_Keyword_43 | 0.00 |
| 113 | Medical_Keyword_42 | 0.00 |
| 112 | Medical_Keyword_41 | 0.00 |
| 111 | Medical_Keyword_40 | 0.00 |
| 110 | Medical_Keyword_39 | 0.00 |
| 109 | Medical_Keyword_38 | 0.00 |
| 108 | Medical_Keyword_37 | 0.00 |
| 106 | Medical_Keyword_35 | 0.00 |
| 95 | Medical_Keyword_24 | 0.00 |
| 105 | Medical_Keyword_34 | 0.00 |
| 104 | Medical_Keyword_33 | 0.00 |
| 103 | Medical_Keyword_32 | 0.00 |
| 102 | Medical_Keyword_31 | 0.00 |
| 101 | Medical_Keyword_30 | 0.00 |
| 100 | Medical_Keyword_29 | 0.00 |
| 99 | Medical_Keyword_28 | 0.00 |
| 98 | Medical_Keyword_27 | 0.00 |
| 97 | Medical_Keyword_26 | 0.00 |
| 68 | Medical_History_38 | 0.00 |
| 96 | Medical_Keyword_25 | 0.00 |
| 69 | Medical_History_39 | 0.00 |
| 60 | Medical_History_29 | 0.00 |
| 67 | Medical_History_37 | 0.00 |
| 19 | InsuredInfo_4 | 0.00 |
| 30 | Insurance_History_9 | 0.00 |
| 29 | Insurance_History_8 | 0.00 |
| 28 | Insurance_History_7 | 0.00 |
| 27 | Insurance_History_5 | 0.00 |
| 26 | Insurance_History_4 | 0.00 |
| 25 | Insurance_History_3 | 0.00 |
| 24 | Insurance_History_2 | 0.00 |
| 23 | Insurance_History_1 | 0.00 |
| 22 | InsuredInfo_7 | 0.00 |
| 20 | InsuredInfo_5 | 0.00 |
| 18 | InsuredInfo_3 | 0.00 |
| 39 | Medical_History_5 | 0.00 |
| 17 | InsuredInfo_2 | 0.00 |
| 16 | InsuredInfo_1 | 0.00 |
| 14 | Employment_Info_5 | 0.00 |
| 13 | Employment_Info_4 | 0.00 |
| 12 | Employment_Info_3 | 0.00 |
| 11 | Employment_Info_2 | 0.00 |
| 7 | Ht | 0.00 |
| 5 | Product_Info_7 | 0.00 |
| 4 | Product_Info_6 | 0.00 |
| 3 | Product_Info_5 | 0.00 |
| 37 | Medical_History_3 | 0.00 |
| 40 | Medical_History_6 | 0.00 |
| 66 | Medical_History_36 | 0.00 |
| 53 | Medical_History_21 | 0.00 |
| 65 | Medical_History_35 | 0.00 |
| 64 | Medical_History_34 | 0.00 |
| 63 | Medical_History_33 | 0.00 |
| 62 | Medical_History_31 | 0.00 |
| 1 | Product_Info_3 | 0.00 |
| 59 | Medical_History_28 | 0.00 |
| 58 | Medical_History_27 | 0.00 |
| 57 | Medical_History_26 | 0.00 |
| 56 | Medical_History_25 | 0.00 |
| 54 | Medical_History_22 | 0.00 |
| 52 | Medical_History_20 | 0.00 |
| 41 | Medical_History_7 | 0.00 |
| 51 | Medical_History_19 | 0.00 |
| 50 | Medical_History_18 | 0.00 |
| 49 | Medical_History_17 | 0.00 |
| 48 | Medical_History_16 | 0.00 |
| 47 | Medical_History_14 | 0.00 |
| 46 | Medical_History_13 | 0.00 |
| 45 | Medical_History_12 | 0.00 |
| 44 | Medical_History_11 | 0.00 |
| 43 | Medical_History_9 | 0.00 |
| 42 | Medical_History_8 | 0.00 |
| 119 | Medical_Keyword_48 | 0.00 |
# Plot only the features with a non-zero importance for the gradient-boosting model
plot_feature_importance(gb_model, X_train)
BMI, weight, Medical_History_23, Medical_History_4 and Medical_Keyword_15 appear to be the five most important features according to gradient boosting.
# Interpret the gradient-boosting model locally using LIME on the test set
interpret_with_lime(gb_model,X_test)
# Interpret the gradient-boosting model globally using SHAP
# NOTE: X_shap (= X_train) is also reused by the XGBoost SHAP cell below
X_shap=X_train
gb_explainer = shap.TreeExplainer(gb_model)
gb_shap_values = gb_explainer.shap_values(X_shap)
shap.summary_plot(gb_shap_values, X_shap, plot_type="dot")
BMI is pushing the model's prediction towards class 0.
Medical_Keyword_15 is pushing towards class 1, whereas Medical_Keyword_4 is pushing towards class 0.
Also, although Wt was among the top five features in the feature-importance plot, the same pattern is not observed here.
# SHAP dependence plots for the five most important gradient-boosting features
top_vars = ['BMI','Medical_Keyword_15','Medical_History_4','Product_Info_4','Medical_History_23']
# Map each feature name to its positional column index in X_train
index_top_vars = [X_train.columns.get_loc(var) for var in top_vars]
for feature_idx in index_top_vars:
    shap.dependence_plot(feature_idx, gb_shap_values, X_train)
For low BMI and high medical history 23 we get class as 1.
# Hyperparameter search space for XGBoost
xgb_parameters = {'max_depth': [1, 3, 5], 'n_estimators': [2, 5, 10], 'learning_rate': [0.01, 0.1, 0.5]}
# Fixed typo in the printed message ('areL:' -> 'are:')
print('XGB parameters are:')
pprint(xgb_parameters)
# Find the best XGBoost model via grid search over the space above
xgb_optimal_model = grid_search(XGBClassifier(), xgb_parameters, X_train, Y_train)
XGB parameters areL:
{'learning_rate': [0.01, 0.1, 0.5],
'max_depth': [1, 3, 5],
'n_estimators': [2, 5, 10]}
Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV] END ....learning_rate=0.01, max_depth=1, n_estimators=2; total time= 0.3s
[CV] END ....learning_rate=0.01, max_depth=1, n_estimators=2; total time= 0.3s
[CV] END ....learning_rate=0.01, max_depth=1, n_estimators=5; total time= 0.3s
[CV] END ....learning_rate=0.01, max_depth=1, n_estimators=5; total time= 0.3s
[CV] END ...learning_rate=0.01, max_depth=1, n_estimators=10; total time= 0.4s
[CV] END ...learning_rate=0.01, max_depth=1, n_estimators=10; total time= 0.5s
[CV] END ....learning_rate=0.01, max_depth=3, n_estimators=2; total time= 0.3s
[CV] END ....learning_rate=0.01, max_depth=3, n_estimators=2; total time= 0.3s
[CV] END ....learning_rate=0.01, max_depth=3, n_estimators=5; total time= 0.4s
[CV] END ....learning_rate=0.01, max_depth=3, n_estimators=5; total time= 0.4s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=10; total time= 0.6s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=10; total time= 0.6s
[CV] END ....learning_rate=0.01, max_depth=5, n_estimators=2; total time= 0.4s
[CV] END ....learning_rate=0.01, max_depth=5, n_estimators=2; total time= 0.4s
[CV] END ....learning_rate=0.01, max_depth=5, n_estimators=5; total time= 0.5s
[CV] END ....learning_rate=0.01, max_depth=5, n_estimators=5; total time= 0.5s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=10; total time= 0.8s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=10; total time= 0.8s
[CV] END .....learning_rate=0.1, max_depth=1, n_estimators=2; total time= 0.3s
[CV] END .....learning_rate=0.1, max_depth=1, n_estimators=2; total time= 0.3s
[CV] END .....learning_rate=0.1, max_depth=1, n_estimators=5; total time= 0.3s
[CV] END .....learning_rate=0.1, max_depth=1, n_estimators=5; total time= 0.3s
[CV] END ....learning_rate=0.1, max_depth=1, n_estimators=10; total time= 0.4s
[CV] END ....learning_rate=0.1, max_depth=1, n_estimators=10; total time= 0.4s
[CV] END .....learning_rate=0.1, max_depth=3, n_estimators=2; total time= 0.3s
[CV] END .....learning_rate=0.1, max_depth=3, n_estimators=2; total time= 0.3s
[CV] END .....learning_rate=0.1, max_depth=3, n_estimators=5; total time= 0.4s
[CV] END .....learning_rate=0.1, max_depth=3, n_estimators=5; total time= 0.4s
[CV] END ....learning_rate=0.1, max_depth=3, n_estimators=10; total time= 0.6s
[CV] END ....learning_rate=0.1, max_depth=3, n_estimators=10; total time= 0.6s
[CV] END .....learning_rate=0.1, max_depth=5, n_estimators=2; total time= 0.4s
[CV] END .....learning_rate=0.1, max_depth=5, n_estimators=2; total time= 0.4s
[CV] END .....learning_rate=0.1, max_depth=5, n_estimators=5; total time= 0.6s
[CV] END .....learning_rate=0.1, max_depth=5, n_estimators=5; total time= 0.6s
[CV] END ....learning_rate=0.1, max_depth=5, n_estimators=10; total time= 0.8s
[CV] END ....learning_rate=0.1, max_depth=5, n_estimators=10; total time= 0.8s
[CV] END .....learning_rate=0.5, max_depth=1, n_estimators=2; total time= 0.3s
[CV] END .....learning_rate=0.5, max_depth=1, n_estimators=2; total time= 0.3s
[CV] END .....learning_rate=0.5, max_depth=1, n_estimators=5; total time= 0.3s
[CV] END .....learning_rate=0.5, max_depth=1, n_estimators=5; total time= 0.3s
[CV] END ....learning_rate=0.5, max_depth=1, n_estimators=10; total time= 0.4s
[CV] END ....learning_rate=0.5, max_depth=1, n_estimators=10; total time= 0.4s
[CV] END .....learning_rate=0.5, max_depth=3, n_estimators=2; total time= 0.4s
[CV] END .....learning_rate=0.5, max_depth=3, n_estimators=2; total time= 0.4s
[CV] END .....learning_rate=0.5, max_depth=3, n_estimators=5; total time= 0.4s
[CV] END .....learning_rate=0.5, max_depth=3, n_estimators=5; total time= 0.4s
[CV] END ....learning_rate=0.5, max_depth=3, n_estimators=10; total time= 0.6s
[CV] END ....learning_rate=0.5, max_depth=3, n_estimators=10; total time= 0.8s
[CV] END .....learning_rate=0.5, max_depth=5, n_estimators=2; total time= 0.4s
[CV] END .....learning_rate=0.5, max_depth=5, n_estimators=2; total time= 0.4s
[CV] END .....learning_rate=0.5, max_depth=5, n_estimators=5; total time= 0.6s
[CV] END .....learning_rate=0.5, max_depth=5, n_estimators=5; total time= 0.6s
[CV] END ....learning_rate=0.5, max_depth=5, n_estimators=10; total time= 0.8s
[CV] END ....learning_rate=0.5, max_depth=5, n_estimators=10; total time= 0.9s
Best parameters are:
{'learning_rate': 0.5, 'max_depth': 5, 'n_estimators': 10}
# Getting the scores for all the score metrics used here (accuracy, ROC AUC, log loss, F1, precision, recall)
xgb_model, xgb_train_auc, xgb_test_auc, xgb_train_accuracy, xgb_test_accuracy,xgb_f1, xgb_precision,xgb_recall,xgb_train_log, xgb_test_log= check_scores(xgb_optimal_model, X_train, X_test )
The accuracy on train dataset is 0.830672504771528 The accuracy on test dataset is 0.825272800754412 Train confusion matrix: [[25730 4177] [ 3364 11264]] Test confusion matrix: [[8566 1419] [1175 3686]] ROC on train data: 0.9070705299819286 ROC on test data: 0.9015875283816488 Train log loss: 0.36021934797456384 Test log loss: 0.368680357765126 F score is: 0.7397150311057596 Precision is: 0.7220372184133202 Recall is: 0.7582801892614688
# Tabulate XGBoost feature importances (sorted, as shown below)
check_importance(xgb_model, X_train)
| Feature | Feature Importance | |
|---|---|---|
| 55 | Medical_History_23 | 0.19 |
| 9 | BMI | 0.15 |
| 38 | Medical_History_4 | 0.13 |
| 74 | Medical_Keyword_3 | 0.06 |
| 86 | Medical_Keyword_15 | 0.05 |
| 61 | Medical_History_30 | 0.05 |
| 46 | Medical_History_13 | 0.04 |
| 22 | InsuredInfo_7 | 0.04 |
| 21 | InsuredInfo_6 | 0.03 |
| 112 | Medical_Keyword_41 | 0.03 |
| 2 | Product_Info_4 | 0.03 |
| 39 | Medical_History_5 | 0.02 |
| 94 | Medical_Keyword_23 | 0.02 |
| 31 | Family_Hist_1 | 0.02 |
| 52 | Medical_History_20 | 0.01 |
| 59 | Medical_History_28 | 0.01 |
| 37 | Medical_History_3 | 0.01 |
| 69 | Medical_History_39 | 0.01 |
| 33 | Family_Hist_3 | 0.01 |
| 24 | Insurance_History_2 | 0.01 |
| 0 | Product_Info_1 | 0.01 |
| 70 | Medical_History_40 | 0.01 |
| 20 | InsuredInfo_5 | 0.01 |
| 6 | Ins_Age | 0.01 |
| 106 | Medical_Keyword_35 | 0.01 |
| 17 | InsuredInfo_2 | 0.01 |
| 78 | Medical_Keyword_7 | 0.00 |
| 85 | Medical_Keyword_14 | 0.00 |
| 84 | Medical_Keyword_13 | 0.00 |
| 83 | Medical_Keyword_12 | 0.00 |
| 115 | Medical_Keyword_44 | 0.00 |
| 82 | Medical_Keyword_11 | 0.00 |
| 81 | Medical_Keyword_10 | 0.00 |
| 80 | Medical_Keyword_9 | 0.00 |
| 79 | Medical_Keyword_8 | 0.00 |
| 116 | Medical_Keyword_45 | 0.00 |
| 87 | Medical_Keyword_16 | 0.00 |
| 77 | Medical_Keyword_6 | 0.00 |
| 117 | Medical_Keyword_46 | 0.00 |
| 76 | Medical_Keyword_5 | 0.00 |
| 75 | Medical_Keyword_4 | 0.00 |
| 118 | Medical_Keyword_47 | 0.00 |
| 73 | Medical_Keyword_2 | 0.00 |
| 72 | Medical_Keyword_1 | 0.00 |
| 71 | Medical_History_41 | 0.00 |
| 114 | Medical_Keyword_43 | 0.00 |
| 88 | Medical_Keyword_17 | 0.00 |
| 113 | Medical_Keyword_42 | 0.00 |
| 104 | Medical_Keyword_33 | 0.00 |
| 105 | Medical_Keyword_34 | 0.00 |
| 103 | Medical_Keyword_32 | 0.00 |
| 102 | Medical_Keyword_31 | 0.00 |
| 101 | Medical_Keyword_30 | 0.00 |
| 100 | Medical_Keyword_29 | 0.00 |
| 99 | Medical_Keyword_28 | 0.00 |
| 98 | Medical_Keyword_27 | 0.00 |
| 97 | Medical_Keyword_26 | 0.00 |
| 107 | Medical_Keyword_36 | 0.00 |
| 95 | Medical_Keyword_24 | 0.00 |
| 108 | Medical_Keyword_37 | 0.00 |
| 109 | Medical_Keyword_38 | 0.00 |
| 110 | Medical_Keyword_39 | 0.00 |
| 111 | Medical_Keyword_40 | 0.00 |
| 93 | Medical_Keyword_22 | 0.00 |
| 92 | Medical_Keyword_21 | 0.00 |
| 91 | Medical_Keyword_20 | 0.00 |
| 90 | Medical_Keyword_19 | 0.00 |
| 89 | Medical_Keyword_18 | 0.00 |
| 96 | Medical_Keyword_25 | 0.00 |
| 60 | Medical_History_29 | 0.00 |
| 68 | Medical_History_38 | 0.00 |
| 16 | InsuredInfo_1 | 0.00 |
| 32 | Family_Hist_2 | 0.00 |
| 30 | Insurance_History_9 | 0.00 |
| 29 | Insurance_History_8 | 0.00 |
| 28 | Insurance_History_7 | 0.00 |
| 27 | Insurance_History_5 | 0.00 |
| 26 | Insurance_History_4 | 0.00 |
| 25 | Insurance_History_3 | 0.00 |
| 23 | Insurance_History_1 | 0.00 |
| 19 | InsuredInfo_4 | 0.00 |
| 18 | InsuredInfo_3 | 0.00 |
| 15 | Employment_Info_6 | 0.00 |
| 67 | Medical_History_37 | 0.00 |
| 14 | Employment_Info_5 | 0.00 |
| 13 | Employment_Info_4 | 0.00 |
| 12 | Employment_Info_3 | 0.00 |
| 11 | Employment_Info_2 | 0.00 |
| 10 | Employment_Info_1 | 0.00 |
| 8 | Wt | 0.00 |
| 7 | Ht | 0.00 |
| 5 | Product_Info_7 | 0.00 |
| 4 | Product_Info_6 | 0.00 |
| 3 | Product_Info_5 | 0.00 |
| 34 | Family_Hist_4 | 0.00 |
| 35 | Medical_History_1 | 0.00 |
| 36 | Medical_History_2 | 0.00 |
| 40 | Medical_History_6 | 0.00 |
| 66 | Medical_History_36 | 0.00 |
| 65 | Medical_History_35 | 0.00 |
| 64 | Medical_History_34 | 0.00 |
| 63 | Medical_History_33 | 0.00 |
| 62 | Medical_History_31 | 0.00 |
| 1 | Product_Info_3 | 0.00 |
| 58 | Medical_History_27 | 0.00 |
| 57 | Medical_History_26 | 0.00 |
| 56 | Medical_History_25 | 0.00 |
| 54 | Medical_History_22 | 0.00 |
| 53 | Medical_History_21 | 0.00 |
| 51 | Medical_History_19 | 0.00 |
| 50 | Medical_History_18 | 0.00 |
| 49 | Medical_History_17 | 0.00 |
| 48 | Medical_History_16 | 0.00 |
| 47 | Medical_History_14 | 0.00 |
| 45 | Medical_History_12 | 0.00 |
| 44 | Medical_History_11 | 0.00 |
| 43 | Medical_History_9 | 0.00 |
| 42 | Medical_History_8 | 0.00 |
| 41 | Medical_History_7 | 0.00 |
| 119 | Medical_Keyword_48 | 0.00 |
The same trend is seen here.
All the models also give similar scores, which may be because the same features contribute the most in each of them.
# Interpret the XGBoost model globally using SHAP
# NOTE: reuses X_shap (= X_train) defined in the earlier SHAP cells
xgb_explainer = shap.TreeExplainer(xgb_model)
xgb_shap_values = xgb_explainer.shap_values(X_shap)
shap.summary_plot(xgb_shap_values, X_shap, plot_type="dot")
Again, BMI is pushing the prediction towards class 0.
Medical_History_4 is pushing towards class 1.
# SHAP dependence plots for the five most important XGBoost features
top_vars = ['BMI','Medical_Keyword_15','Medical_History_4','Product_Info_4','Medical_History_23']
# Map each feature name to its positional column index in X_train
index_top_vars = [X_train.columns.get_loc(var) for var in top_vars]
for feature_idx in index_top_vars:
    shap.dependence_plot(feature_idx, xgb_shap_values, X_train)
For product info 4 and wt we see some interesting trend
# Hyperparameter search space for logistic regression
lr_parameters = {
    'solver': ['lbfgs'],
    'penalty': ['l2'],
    'C': [100, 10, 1.0, 0.1, 0.01],
}
# Find the best logistic-regression model via grid search over the space above
lr_optimal_model = grid_search(LogisticRegression( max_iter=5000), lr_parameters, X_train, Y_train)
Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time= 21.0s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time= 20.2s
[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time= 17.7s
[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time= 17.8s
[CV] END ....................C=1.0, penalty=l2, solver=lbfgs; total time= 17.0s
[CV] END ....................C=1.0, penalty=l2, solver=lbfgs; total time= 15.8s
[CV] END ....................C=0.1, penalty=l2, solver=lbfgs; total time= 13.0s
[CV] END ....................C=0.1, penalty=l2, solver=lbfgs; total time= 12.0s
[CV] END ...................C=0.01, penalty=l2, solver=lbfgs; total time= 14.9s
[CV] END ...................C=0.01, penalty=l2, solver=lbfgs; total time= 5.4s
Best parameters are:
{'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
# Getting the scores for all the score metrics used here (accuracy, ROC AUC, log loss, F1, precision, recall)
lr_model, lr_train_auc, lr_test_auc, lr_train_accuracy, lr_test_accuracy,lr_f1, lr_precision, lr_recall,lr_train_log, lr_test_log = check_scores(lr_optimal_model, X_train, X_test )
The accuracy on train dataset is 0.8132255529358932 The accuracy on test dataset is 0.8102519197090126 Train confusion matrix: [[26094 3813] [ 4505 10123]] Test confusion matrix: [[8723 1262] [1555 3306]] ROC on train data: 0.8853056177733145 ROC on test data: 0.8811828934514712 Train log loss: 0.39564636143443066 Test log loss: 0.4011729163336869 F score is: 0.7012408526885141 Precision is: 0.7237302977232924 Recall is: 0.6801069738736886
# Pair each feature name with its rounded logistic-regression coefficient
# (coef_ is 2-D for the binary classifier; flatten it to one value per feature)
importance_df_lr = pd.DataFrame({
    'Feature': X_train.columns.values,
    'Feature Importance': np.round(lr_optimal_model.coef_, 2).reshape(-1),
})
importance_df_lr.sort_values(by=['Feature Importance'], ascending=False, inplace=True)
importance_df_lr
| Feature | Feature Importance | |
|---|---|---|
| 112 | Medical_Keyword_41 | 1.86 |
| 7 | Ht | 1.69 |
| 33 | Family_Hist_3 | 1.29 |
| 38 | Medical_History_4 | 1.24 |
| 52 | Medical_History_20 | 1.23 |
| 2 | Product_Info_4 | 0.85 |
| 70 | Medical_History_40 | 0.84 |
| 114 | Medical_Keyword_43 | 0.79 |
| 83 | Medical_Keyword_12 | 0.75 |
| 49 | Medical_History_17 | 0.61 |
| 77 | Medical_Keyword_6 | 0.61 |
| 116 | Medical_Keyword_45 | 0.60 |
| 44 | Medical_History_11 | 0.57 |
| 62 | Medical_History_31 | 0.50 |
| 100 | Medical_Keyword_29 | 0.49 |
| 34 | Family_Hist_4 | 0.43 |
| 55 | Medical_History_23 | 0.38 |
| 32 | Family_Hist_2 | 0.38 |
| 21 | InsuredInfo_6 | 0.36 |
| 41 | Medical_History_7 | 0.35 |
| 101 | Medical_Keyword_30 | 0.33 |
| 97 | Medical_Keyword_26 | 0.33 |
| 93 | Medical_Keyword_22 | 0.32 |
| 69 | Medical_History_39 | 0.30 |
| 47 | Medical_History_14 | 0.29 |
| 91 | Medical_Keyword_20 | 0.27 |
| 58 | Medical_History_27 | 0.25 |
| 37 | Medical_History_3 | 0.25 |
| 115 | Medical_Keyword_44 | 0.24 |
| 81 | Medical_Keyword_10 | 0.23 |
| 31 | Family_Hist_1 | 0.21 |
| 110 | Medical_Keyword_39 | 0.21 |
| 96 | Medical_Keyword_25 | 0.20 |
| 25 | Insurance_History_3 | 0.19 |
| 78 | Medical_Keyword_7 | 0.18 |
| 98 | Medical_Keyword_27 | 0.18 |
| 54 | Medical_History_22 | 0.17 |
| 46 | Medical_History_13 | 0.15 |
| 5 | Product_Info_7 | 0.14 |
| 87 | Medical_Keyword_16 | 0.14 |
| 14 | Employment_Info_5 | 0.13 |
| 103 | Medical_Keyword_32 | 0.13 |
| 76 | Medical_Keyword_5 | 0.13 |
| 19 | InsuredInfo_4 | 0.12 |
| 105 | Medical_Keyword_34 | 0.12 |
| 104 | Medical_Keyword_33 | 0.11 |
| 28 | Insurance_History_7 | 0.11 |
| 73 | Medical_Keyword_2 | 0.10 |
| 60 | Medical_History_29 | 0.10 |
| 15 | Employment_Info_6 | 0.09 |
| 108 | Medical_Keyword_37 | 0.08 |
| 85 | Medical_Keyword_14 | 0.08 |
| 26 | Insurance_History_4 | 0.08 |
| 79 | Medical_Keyword_8 | 0.07 |
| 29 | Insurance_History_8 | 0.07 |
| 90 | Medical_Keyword_19 | 0.07 |
| 63 | Medical_History_33 | 0.06 |
| 10 | Employment_Info_1 | 0.06 |
| 71 | Medical_History_41 | 0.06 |
| 92 | Medical_Keyword_21 | 0.06 |
| 107 | Medical_Keyword_36 | 0.04 |
| 118 | Medical_Keyword_47 | 0.02 |
| 64 | Medical_History_34 | 0.02 |
| 30 | Insurance_History_9 | 0.01 |
| 53 | Medical_History_21 | 0.01 |
| 82 | Medical_Keyword_11 | -0.00 |
| 1 | Product_Info_3 | -0.00 |
| 36 | Medical_History_2 | 0.00 |
| 18 | InsuredInfo_3 | -0.00 |
| 35 | Medical_History_1 | 0.00 |
| 11 | Employment_Info_2 | -0.00 |
| 40 | Medical_History_6 | -0.01 |
| 48 | Medical_History_16 | -0.01 |
| 99 | Medical_Keyword_28 | -0.01 |
| 66 | Medical_History_36 | -0.02 |
| 88 | Medical_Keyword_17 | -0.03 |
| 67 | Medical_History_37 | -0.03 |
| 27 | Insurance_History_5 | -0.03 |
| 89 | Medical_Keyword_18 | -0.04 |
| 4 | Product_Info_6 | -0.06 |
| 72 | Medical_Keyword_1 | -0.07 |
| 43 | Medical_History_9 | -0.07 |
| 12 | Employment_Info_3 | -0.08 |
| 95 | Medical_Keyword_24 | -0.09 |
| 42 | Medical_History_8 | -0.12 |
| 13 | Employment_Info_4 | -0.12 |
| 59 | Medical_History_28 | -0.12 |
| 113 | Medical_Keyword_42 | -0.14 |
| 45 | Medical_History_12 | -0.15 |
| 117 | Medical_Keyword_46 | -0.16 |
| 94 | Medical_Keyword_23 | -0.17 |
| 56 | Medical_History_25 | -0.19 |
| 84 | Medical_Keyword_13 | -0.20 |
| 16 | InsuredInfo_1 | -0.21 |
| 111 | Medical_Keyword_40 | -0.23 |
| 23 | Insurance_History_1 | -0.23 |
| 57 | Medical_History_26 | -0.27 |
| 24 | Insurance_History_2 | -0.32 |
| 119 | Medical_Keyword_48 | -0.33 |
| 51 | Medical_History_19 | -0.40 |
| 50 | Medical_History_18 | -0.41 |
| 20 | InsuredInfo_5 | -0.43 |
| 102 | Medical_Keyword_31 | -0.45 |
| 22 | InsuredInfo_7 | -0.47 |
| 75 | Medical_Keyword_4 | -0.48 |
| 68 | Medical_History_38 | -0.50 |
| 80 | Medical_Keyword_9 | -0.65 |
| 3 | Product_Info_5 | -0.69 |
| 65 | Medical_History_35 | -0.79 |
| 0 | Product_Info_1 | -0.81 |
| 6 | Ins_Age | -1.00 |
| 109 | Medical_Keyword_38 | -1.35 |
| 106 | Medical_Keyword_35 | -1.77 |
| 61 | Medical_History_30 | -1.80 |
| 39 | Medical_History_5 | -2.12 |
| 17 | InsuredInfo_2 | -2.23 |
| 86 | Medical_Keyword_15 | -2.25 |
| 74 | Medical_Keyword_3 | -3.25 |
| 8 | Wt | -4.11 |
| 9 | BMI | -8.62 |
# Bar chart of the features whose logistic-regression coefficient is positive
fig = plt.figure(figsize=(15, 8))
# Filter once instead of re-evaluating the mask per column
positive = importance_df_lr[importance_df_lr['Feature Importance'] > 0]
plt.bar(positive['Feature'].values, positive['Feature Importance'].values,
        color='blue', width=0.4)
plt.xticks(rotation='vertical')
plt.show()
And again the same pattern when doing feature importance
# Interpret the logistic-regression model locally using LIME on the test set
interpret_with_lime(lr_model,X_test)
Only BMI and medical history 4 pushing towards class 0
# Combine all four tuned models into a soft-voting ensemble
estimators = [
    ('logistic', lr_optimal_model),
    ('XGB', xgb_optimal_model),
    ('GB', gb_optimal_model),
    ('rf', rf_optimal_model),
]
# create and fit the voting model (soft voting averages predicted probabilities)
voting_model = VotingClassifier(estimators, voting='soft')
voting_model.fit(X_train, Y_train)
VotingClassifier(estimators=[('logistic',
LogisticRegression(C=10, max_iter=5000)),
('XGB',
XGBClassifier(base_score=0.5, booster='gbtree',
callbacks=None, colsample_bylevel=1,
colsample_bynode=1,
colsample_bytree=1,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None, gamma=0,
gpu_id=-1, grow_policy='depthwise',
importance_type=None,
interaction...
max_delta_step=0, max_depth=5,
max_leaves=0, min_child_weight=1,
missing=nan,
monotone_constraints='()',
n_estimators=10, n_jobs=0,
num_parallel_tree=1,
predictor='auto', random_state=0,
reg_alpha=0, reg_lambda=1, ...)),
('GB',
GradientBoostingClassifier(max_depth=5,
n_estimators=250)),
('rf',
RandomForestClassifier(max_depth=8,
min_samples_leaf=40,
min_samples_split=150))],
voting='soft')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. VotingClassifier(estimators=[('logistic',
LogisticRegression(C=10, max_iter=5000)),
('XGB',
XGBClassifier(base_score=0.5, booster='gbtree',
callbacks=None, colsample_bylevel=1,
colsample_bynode=1,
colsample_bytree=1,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None, gamma=0,
gpu_id=-1, grow_policy='depthwise',
importance_type=None,
interaction...
max_delta_step=0, max_depth=5,
max_leaves=0, min_child_weight=1,
missing=nan,
monotone_constraints='()',
n_estimators=10, n_jobs=0,
num_parallel_tree=1,
predictor='auto', random_state=0,
reg_alpha=0, reg_lambda=1, ...)),
('GB',
GradientBoostingClassifier(max_depth=5,
n_estimators=250)),
('rf',
RandomForestClassifier(max_depth=8,
min_samples_leaf=40,
min_samples_split=150))],
voting='soft')LogisticRegression(C=10, max_iter=5000)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.5, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=10, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=0,
reg_alpha=0, reg_lambda=1, ...)GradientBoostingClassifier(max_depth=5, n_estimators=250)
RandomForestClassifier(max_depth=8, min_samples_leaf=40, min_samples_split=150)
# Getting all the scores and errors for the voting ensemble
voting_model, voting_train_auc, voting_test_auc, voting_train_accuracy, voting_test_accuracy, voting_f1, voting_precision, voting_recall, voting_train_log, voting_test_log = check_scores(voting_model, X_train, X_test )
The accuracy on train dataset is 0.8390479398226114 The accuracy on test dataset is 0.8287754277246396 Train confusion matrix: [[26517 3390] [ 3778 10850]] Test confusion matrix: [[8826 1159] [1383 3478]] ROC on train data: 0.9176653029550663 ROC on test data: 0.9043878304599463 Train log loss: 0.35845284864510235 Test log loss: 0.37523507156321295 F score is: 0.7323647083596546 Precision is: 0.7500539141686435 Recall is: 0.7154906397860522
# Stacked ensemble: the tuned LR, XGBoost and GB models act as base learners;
# their class probabilities (use_probas=True) plus the original features
# (use_features_in_secondary=True) feed a Random Forest meta-learner.
stacked_classifier = StackingClassifier(
    classifiers=[lr_optimal_model, xgb_optimal_model, gb_model],
    meta_classifier=RandomForestClassifier(),
    use_probas=True,
    use_features_in_secondary=True,
)
# Fit the whole stack (base learners + meta-learner) on the training split.
stacked_model = stacked_classifier.fit(X_train, Y_train)
stacked_model, stacked_train_auc, stacked_test_auc, stacked_train_accuracy, stacked_test_accuracy, stacked_f1, stacked_precision, stacked_recall, stacked_train_log, stacked_test_log = check_scores(stacked_model, X_train, X_test )
The accuracy on train dataset is 0.9999550915010665 The accuracy on test dataset is 0.8303246665768557 Train confusion matrix: [[29907 0] [ 2 14626]] Test confusion matrix: [[8733 1252] [1267 3594]] ROC on train data: 1.0 ROC on test data: 0.9054788209881166 Train log loss: 0.07644879208172911 Test log loss: 0.3941304875937328 F score is: 0.7404965488822499 Precision is: 0.741642591828312 Recall is: 0.7393540423781115
# Collect one (model name, metrics...) row per trained model so every
# classifier can be compared side by side in a single table.
scores_ = [
    ("Random Forest", rf_train_auc, rf_test_auc, rf_train_accuracy,
     rf_test_accuracy, rf_train_log, rf_test_log, rf_f1, rf_precision,
     rf_recall),
    ("Gradient Boosting", gb_train_auc, gb_test_auc, gb_train_accuracy,
     gb_test_accuracy, gb_train_log, gb_test_log, gb_f1, gb_precision,
     gb_recall),
    ("XG Boost", xgb_train_auc, xgb_test_auc, xgb_train_accuracy,
     xgb_test_accuracy, xgb_train_log, xgb_test_log, xgb_f1, xgb_precision,
     xgb_recall),
    ("Logistic Regression", lr_train_auc, lr_test_auc, lr_train_accuracy,
     lr_test_accuracy, lr_train_log, lr_test_log, lr_f1, lr_precision,
     lr_recall),
    ("Voting Classifier", voting_train_auc, voting_test_auc,
     voting_train_accuracy, voting_test_accuracy, voting_train_log,
     voting_test_log, voting_f1, voting_precision, voting_recall),
    ("Stacked Model", stacked_train_auc, stacked_test_auc,
     stacked_train_accuracy, stacked_test_accuracy, stacked_train_log,
     stacked_test_log, stacked_f1, stacked_precision, stacked_recall),
]
# Build the summary frame and index it by model name (chained, non-inplace).
Scores_ = (
    pd.DataFrame(scores_,
                 columns=['Model Name', 'Train ROC', 'Test ROC',
                          'Train Accuracy', 'Test Accuracy',
                          'Train Log Loss', 'Test Log Loss',
                          'F-Score', 'Precision', 'Recall'])
    .set_index('Model Name')
)
Scores_
| Train ROC | Test ROC | Train Accuracy | Test Accuracy | Train Log Loss | Test Log Loss | F-Score | Precision | Recall | |
|---|---|---|---|---|---|---|---|---|---|
| Model Name | |||||||||
| Random Forest | 0.892154 | 0.886213 | 0.810307 | 0.806143 | 0.424388 | 0.428711 | 0.667667 | 0.760990 | 0.594734 |
| Gradient Boosting | 0.937822 | 0.909214 | 0.864489 | 0.833760 | 0.303325 | 0.353909 | 0.747493 | 0.743538 | 0.751491 |
| XG Boost | 0.907071 | 0.901588 | 0.830673 | 0.825273 | 0.360219 | 0.368680 | 0.739715 | 0.722037 | 0.758280 |
| Logistic Regression | 0.885306 | 0.881183 | 0.813226 | 0.810252 | 0.395646 | 0.401173 | 0.701241 | 0.723730 | 0.680107 |
| Voting Classifier | 0.917665 | 0.904388 | 0.839048 | 0.828775 | 0.358453 | 0.375235 | 0.732365 | 0.750054 | 0.715491 |
| Stacked Model | 1.000000 | 0.905479 | 0.999955 | 0.830325 | 0.076449 | 0.394130 | 0.740497 | 0.741643 | 0.739354 |
Gradient Boosting, the Voting Classifier, and the Stacked model deliver the best test ROC and F-scores of the group. For Gradient Boosting and the Voting Classifier the train and test metrics are also close together, which suggests little overfitting. The Stacked model, by contrast, fits the training data almost perfectly (train ROC = 1.0, train accuracy ≈ 0.99996), so despite its competitive test scores it shows a much larger train/test gap.